

/*
Clean Safegraph POI visits data
*/

* ++++++++++++++++++++++
*** POI to NAICS and zip crosswalk
* ++++++++++++++++++++++

* POI to NAICS codes crosswalk
forval i = 1/5 {
	import delimited "${data_raw_poi}/CoreApr2020Release-CORE_POI-2020_03-2020-04-07/core_poi-part`i'.csv", clear
	keep safegraph_place_id naics_code postal_code
	tempfile part`i'
	save `part`i''
	}
	
use `part1', clear
forval i = 2/5 {
	append using `part`i''
	}	

preserve 
drop postal_code
isid safegraph_place_id
save "${data_derived_poi}/poi_naics_cw.dta", replace
restore

* POI to zip crosswalk by industry
* highlight certain key types of places 
* restrict places to these and highlight their industry
* source: https://www.census.gov/eos/www/naics/2017NAICS/2017_NAICS_Manual.pdf
gen naics_string = string(naics)
gen naics_short = substr(naics_string, 1,2)

preserve 
	keep if substr(naics_string, 1,3) == "722"
	replace naics_short = "food_places"
	tempfile food_places
	save `food_places'
restore

preserve 
	keep if substr(naics_string, 1,3) == "445"
	replace naics_short = "food_stores"
	tempfile food_stores
	save `food_stores'
restore

preserve 
	keep if (substr(naics_string, 1,2) == "44" | substr(naics_string, 1,2) == "45") ///
		& substr(naics_string, 1,3) != "445"
	replace naics_short = "other_retail_trade"
	tempfile other_retail_trade
	save `other_retail_trade'
restore

preserve 
	keep if substr(naics_string, 1,6) == "712190"
	replace naics_short = "parks"
	tempfile parks
	save `parks'
restore

preserve 
	keep if substr(naics_string, 1,3) == "611"
	replace naics_short = "schools"
	tempfile schools
	save `schools'
restore

preserve 
	keep if substr(naics_string, 1,2) == "71"
	replace naics_short = "entertainment"
	tempfile entertainment
	save `entertainment'
restore

preserve 
	keep if substr(naics_string, 1,2) == "62"
	replace naics_short = "health_care"
	tempfile health_care
	save `health_care'
restore

append using `food_places'
append using `food_stores'
append using `other_retail_trade'
append using `parks'
append using `schools'
append using `entertainment'
append using `health_care'

* collapse --> count number of pois by type by zcta
gen number_poi = 1
ren postal_code zcta
collapse (rawsum) number_poi, by(zcta naics_short)

save "${data_derived_poi}/number_of_pois_by_zip_by_ind.dta", replace

* ++++++++++++++++++++++
*** POI patterns by day by industry
* all files separately --> combine them in next section
* ++++++++++++++++++++++

* define program to clean POI data 
* you first need to load a file and then you can run this program
cap program drop clean_poi
program define clean_poi
	merge 1:1 safegraph_place_id using "${data_derived_poi}/poi_naics_cw.dta", ///
		nogen keep(3)
	
	* which week are we talking about? 
	gen start_date = date(substr(date_range_start,1,10), "YMD")
	qui: su start_date
	local start_date = `r(mean)'
	keep safegraph_place_id postal_code visits_by_day naics

	* extract information by day
	replace visits_by_day = substr(visits_by_day, 2, .)
	replace visits_by_day = regexr(visits_by_day, "]", "")
	split visits_by_day, parse(",")
	drop visits_by_day
	destring visits_by_day*, replace force
	ren visits_by_day* num_visits*
	
	* reshape long on days
	reshape long num_visits, i(safegraph_place_id) j(day)
	
	* create week variable
	replace day = day + `start_date'-1
	format day %td
	
	end
	
	
* files from Jan 6 - June 14
local files_old_format "01-06 01-13 01-20 01-27 02-03 02-10 02-17 02-24 03-02 03-09 03-16 03-23 03-30 04-06 04-13 04-20 04-27 05-04 05-11 05-18 05-25 06-01 06-08"
foreach d in `files_old_format' {
	import delimited "${data_raw_weekly_places}/2020-`d'-weekly-patterns", clear
	clean_poi
	
	* collapse over NAICS codes and weeks
	gen week = week(day)
	collapse (rawsum) num_visits, by(postal_code naics week)
	ren postal_code zcta
	
	* save
	save "${data_derived_poi}/zip_poi_visits_by_ind_`d'.dta", replace
	}
	
	
* files for later weeks have a different format so we need to clean them separately 
local files_new_format "06_24 07_01 07_08 07_15 07_22 07_29 08_05"
foreach d in `files_new_format' {
	forval i = 1/4 {
		import delimited "${data_raw_weekly_places}/`d'/patterns-part`i'", clear
		tempfile part`i'
		save `part`i''
		}
	use `part1', clear 
	forval i=2/4 {
		append using `part`i''
		}
	clean_poi
	
	* collapse over NAICS codes and weeks
	gen week = week(day)
	collapse (rawsum) num_visits, by(postal_code naics week)
	ren postal_code zcta
	
	* save
	save "${data_derived_poi}/zip_poi_visits_by_ind_`d'.dta", replace
	}
	
	
* beginning of January comes from file that is again in different format
forval i = 1/4 {
	import delimited "${data_raw_weekly_places}/Jan2020/patterns-part`i'", clear
	tempfile part`i'
	save `part`i''
	}
use `part1', clear 
forval i=2/4 {
	append using `part`i''
	}
clean_poi
* focus on days not included in later files
gen day_only = day(day)
drop if day_only >=6
* collapse over NAICS codes and weeks
gen week = week(day)
collapse (rawsum) num_visits, by(postal_code naics week)
ren postal_code zcta

* save
save "${data_derived_poi}/zip_poi_visits_by_ind_01_01.dta", replace


* ++++++++++++++++++++++
*** Combine data and highlight key POI
* ++++++++++++++++++++++

* append data
use "${data_derived_poi}/zip_poi_visits_by_ind_01_01.dta", clear
foreach d in `files_old_format' `files_new_format' {
	append using "${data_derived_poi}/zip_poi_visits_by_ind_`d'.dta"
	}
	
* there might be duplicates because week cutoff in files doesn't align with cutoffs in stata
* --> collapse again
collapse (rawsum) num_visits, by(zcta naics week)
	
* highlight certain key types of places 
* restrict places to these and highlight their industry
* source: https://www.census.gov/eos/www/naics/2017NAICS/2017_NAICS_Manual.pdf
gen naics_string = string(naics)
gen naics_short = substr(naics_string, 1,2)
preserve 
	keep if substr(naics_string, 1,3) == "722"
	replace naics_short = "00"
	tempfile food_places
	save `food_places'
restore

preserve 
	keep if substr(naics_string, 1,3) == "445"
	replace naics_short = "01"
	tempfile food_stores
	save `food_stores'
restore

preserve 
	keep if (substr(naics_string, 1,2) == "44" | substr(naics_string, 1,2) == "45") ///
		& substr(naics_string, 1,3) != "445"
	replace naics_short = "01a"
	tempfile other_retail_trade
	save `other_retail_trade'
restore

preserve 
	keep if substr(naics_string, 1,6) == "712190"
	replace naics_short = "02"
	tempfile parks
	save `parks'
restore

preserve 
	keep if substr(naics_string, 1,3) == "611"
	replace naics_short = "03"
	tempfile schools
	save `schools'
restore

preserve 
	keep if substr(naics_string, 1,2) == "71"
	replace naics_short = "04"
	tempfile entertainment
	save `entertainment'
restore

preserve 
	keep if substr(naics_string, 1,2) == "62"
	replace naics_short = "05"
	tempfile health_care
	save `health_care'
restore

append using `food_places'
append using `food_stores'
append using `other_retail_trade'
append using `parks'
append using `schools'
append using `entertainment'
append using `health_care'

* collapse to 2-digit NAICS codes
collapse (rawsum) num_visits, by(zcta week naics_short)

* now highlight the industries saved before in special way
replace naics_short = "food_places" if naics_short =="00"
replace naics_short = "food_stores" if naics_short =="01"
replace naics_short = "other_retail_trade" if naics_short =="01a"
replace naics_short = "parks" if naics_short =="02"
replace naics_short = "schools" if naics_short =="03"
replace naics_short = "entertainment" if naics_short =="04"
replace naics_short = "health_care" if naics_short =="05"

* make panel balanced
* account for the fact that not all industry codes for all dates for all zip codes
* if something doesn't exist anymore at later day, impute zero visits (e.g. social
* distancing led to complete closure of the place)
egen date_num = group(week)
qui: su date_num
local max = `r(max)'

preserve 
	bys zcta naics_short: keep if _n==1
	keep zcta naics_short
	expand `max'
	bys zcta naics_short : gen date_num = _n
	tempfile expanded
	save `expanded'
restore

merge 1:1 zcta naics_short date_num using `expanded'

* highlight the imputation
gen impute_zero = _merge ==2
gen exist_at_baseline = _merge ==3
replace num_visits = 0 if impute_zero ==1 

* also backfill the week if missing (because of the imputation)	
qui: su date_num
forval i =`r(min)'/`r(max)' {
	qui: su week if date_num ==`i'
	local week = `r(mean)'
	qui: replace week = `week' if date_num == `i' & week == .
	}	

* clean up
drop _merge date_num 

* merge on number of POIs
merge m:1 zcta naics_short using ///
	"${data_derived_poi}/number_of_pois_by_zip_by_ind.dta", ///
	assert(1 2 3) keep(3) keepusing(number_poi) nogen
	
* save
save "${data_derived_poi}/zip_poi_visits_by_ind_weekly.dta", replace

